Adapting methods described in the poster uploaded on Google Drive, I first calculated the mean pre-pubertal fE2 concentration. Because infants/juvs have elevated fE2 when nursing, we need to exclude these individuals when determining what typical pre-pubertal fE2 looks like.
Per Slack communication on 7/17: defining prebuteral E from individuals greater than 2.5 years (def not nursing) and under 3 (def not pubertal)
## calculating average pre-pubertal E2
prepubertal_data<-e_data |>
filter(age_at_sample >= 2.5 & age_at_sample <= 3) |> # we want age 2.5-3...def pre menstruation
select(ind_id, age_at_sample, e_conc_ug)
prepubertal_e_data <- prepubertal_data |>
summarise(mean_e_conc_ug = mean(e_conc_ug, na.rm = TRUE),
sd_e_conc_ug = sd(e_conc_ug, na.rm = TRUE))
mean_prepubertal_e_conc_ug <- prepubertal_e_data$mean_e_conc_ug
sd_prepubertal_e_conc_ug <- prepubertal_e_data$sd_e_conc_ug
Using the mean + 2SD approach:
calculated_treshold <- mean_prepubertal_e_conc_ug + 2*sd_prepubertal_e_conc_ug
print(paste("Calculated fE2 threshold for menarche: ", calculated_treshold, " ug/g"))
## [1] "Calculated fE2 threshold for menarche: 0.784032497466729 ug/g"
Looking at the average age at which the population has an E reading that crosses the threshold E. Restricting dataset to individuals >3 years old.
## creating df ----
e_data_age_at_menarche_df <- e_data |>
select(c(age_at_sample, age_at_sample_years, e_conc_ug)) |>
group_by(age_at_sample) |>
mutate(mean_e_at_age = mean(e_conc_ug, na.rm = TRUE),
se_e_at_age = sd(e_conc_ug, na.rm = TRUE) / sqrt(n())) |>
ungroup() |>
arrange(age_at_sample) |>
distinct() |>
group_by(age_at_sample_years) |>
mutate(mean_e_at_age_years = mean(e_conc_ug, na.rm = TRUE),
se_e_at_age_years = sd(e_conc_ug, na.rm = TRUE) / sqrt(n())) |>
ungroup() |>
select(-e_conc_ug) |>
arrange(age_at_sample_years) |>
distinct()
## average_age_at_menarche----
average_age_at_menarche <- e_data_age_at_menarche_df |>
filter(age_at_sample > 3) |>
filter(mean_e_at_age > calculated_treshold) |>
slice(1) |>
pull(age_at_sample)
# standard error of average_age_at_menarche
se_age_at_menarche <- e_data_age_at_menarche_df |>
filter(age_at_sample > 3) |>
filter(mean_e_at_age > calculated_treshold) |>
summarise(se_age_at_sample = sd(age_at_sample, na.rm = TRUE) / sqrt(n())) |>
pull(se_age_at_sample)
print(paste("Calculated average age at menarche: ", average_age_at_menarche, " years"))
## [1] "Calculated average age at menarche: 3.22 years"
print(paste("SE average age at menarche: ", se_age_at_menarche))
## [1] "SE average age at menarche: 0.343412350546469"
NOTE: This calculated age is definitely too young.
The E data is super not normal at the population-level, even after attempting log transformation….I am holding off on any real modeling for now
shapiro.test(e_data_age_at_menarche_df$mean_e_at_age)
##
## Shapiro-Wilk normality test
##
## data: e_data_age_at_menarche_df$mean_e_at_age
## W = 0.20031, p-value < 2.2e-16
shapiro.test(log(e_data_age_at_menarche_df$mean_e_at_age))
##
## Shapiro-Wilk normality test
##
## data: log(e_data_age_at_menarche_df$mean_e_at_age)
## W = 0.99535, p-value = 0.008119
For interpretability, this plot just has the data sorted into age-years, rather than exact age estimations with lots of decimals
## get sample size ----
n_figure_1 <- nrow(e_data)
##Figure 1
par(mfrow = c(1,1))
ggplot(data = e_data_age_at_menarche_df |>
select(age_at_sample_years, mean_e_at_age_years, se_e_at_age_years) |>
distinct(), aes(x = age_at_sample_years, y = mean_e_at_age_years)) +
geom_smooth() +
geom_point()+
geom_vline(xintercept = average_age_at_menarche, linetype = "dashed", color = "red") +
geom_hline(yintercept = calculated_treshold, linetype = "dashed", color = "skyblue") +
geom_errorbar(aes(ymin = mean_e_at_age_years - se_e_at_age_years, ymax = mean_e_at_age_years + se_e_at_age_years), width = 0.2) +
geom_rect(aes(xmin = average_age_at_menarche - 1.96*se_age_at_menarche, xmax = average_age_at_menarche + 1.96*se_age_at_menarche,
ymin = -Inf, ymax = Inf), alpha = 0.05, fill = "pink") +
ggtitle(paste0("Average estradiol by age at sample (N = ", n_figure_1, " samples)"))+
xlab("Age at sample")+
ylab("Fecal estradiol (ug/g)")
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
Interpreting the graph:
The black circles and error bars represent the average age fE2 reading (with standard error) for each age-year for the population
The blue line represents the results of a Loess regression, a non-parametric approach to modeling the relationship between x and y. The gray shading around the blue line is the error of the estimate.
The dashed light blue line is the calculated prepubertal fE threshold (y = 0.7840325 ug/g)
The dashed red line is the calculated average age at puberty (x = 3.22 years)
The shaded area around the dashed red line is the 95% CI for the average age at puberty estimate (3.22 ± 1.96* SE)
NOTE: I haven’t yet calculated individual E thresholds/ages at menarche because I think we are going to want a different approach than mean + 2sd….still working on that.
I am pulling data on females that we have good E coverage for before and after potential menarche…I’m looking for IDs with samples starting at least by age 3. At least 25 samples per ID. That leaves us with the following:
par(mfrow = c(1,1))
# Panel of all individual females who have good E coverage before and after menarche
females_for_menarche_panel<-e_data |>
group_by(ind_id) |>
filter(min(age_at_sample)<3) |>
mutate(count = n()) |>
ungroup() |>
select(ind_id,count) |>
arrange(desc(count)) |>
distinct() |>
filter(count>25) |>
pull(ind_id)
females_for_menarche_panel_df<- e_data |>
filter(ind_id %in% females_for_menarche_panel) |>
filter(age_at_sample < 8) |>
select(ind_id, age_at_sample, sample_id, date, month, time, hour, e_conc_ug, is_pregnant) |>
arrange(ind_id, age_at_sample) |>
mutate(date = as.Date(date))
n_figure_2 <-nrow(females_for_menarche_panel_df)
# facet wrap plots with geom_smooth
ggplot(data = females_for_menarche_panel_df, aes(x = age_at_sample, y = log(e_conc_ug), color = ind_id)) +
geom_point()+
facet_wrap(~ ind_id)+
geom_smooth()+
ggtitle(paste0("Estradiol by age at sample, separated by individual (n = ", n_figure_2, " samples)"))+
xlab("Age at sample")+
ylab("Log-transformed fecal estradiol (log(ug/g))")+
theme(legend.position = "none")
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# Loop through each individual and create a ggplot for each
for (individual in females_for_menarche_panel) {
individual_data <- females_for_menarche_panel_df |> filter(ind_id == individual)
p <- ggplot(data = individual_data, aes(x = age_at_sample, y = log(e_conc_ug))) +
geom_point() +
ggtitle(paste0("Estradiol by age at sample for individual ", individual, " (n = ", nrow(individual_data), " samples)")) +
xlab("Age at sample") +
ylab("Log-transformed fecal estradiol (log(ug/g))") +
theme(legend.position = "none")+
geom_line(color = "blue")
print(p)
}
^Note that the y-axis is log-transformed on this plot
shapiro.test(females_for_menarche_panel_df$e_conc_ug) #nope
##
## Shapiro-Wilk normality test
##
## data: females_for_menarche_panel_df$e_conc_ug
## W = 0.46254, p-value < 2.2e-16
shapiro.test(log(females_for_menarche_panel_df$e_conc_ug)) # log transformation does the trick!
##
## Shapiro-Wilk normality test
##
## data: log(females_for_menarche_panel_df$e_conc_ug)
## W = 0.99592, p-value = 0.1774
Log transformation does the trick! Data is now normal for our young female panel.
I am including individual ID as a random effect. In unadjusted model,
fixed effects are month and time of day (accounting for seasonality and
diurnal variation) with individual ID included as a random effect. Any
other things to control for?
The adjusted model includes age as a fixed effect. Age improves the
model fit!
unadjusted_e_model <- lmer(log(e_conc_ug) ~ + month + hour+ (1 | ind_id), data = females_for_menarche_panel_df)
adjusted_e_model <- lmer(log(e_conc_ug) ~ age_at_sample + month + hour+ (1 | ind_id), data = females_for_menarche_panel_df)
anova(unadjusted_e_model, adjusted_e_model) # adjusted_e_model fits better
## refitting model(s) with ML (instead of REML)
## Data: females_for_menarche_panel_df
## Models:
## unadjusted_e_model: log(e_conc_ug) ~ +month + hour + (1 | ind_id)
## adjusted_e_model: log(e_conc_ug) ~ age_at_sample + month + hour + (1 | ind_id)
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## unadjusted_e_model 5 1879.4 1900.8 -934.70 1869.4
## adjusted_e_model 6 1751.7 1777.5 -869.86 1739.7 129.68 1 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(adjusted_e_model)
## Linear mixed model fit by REML ['lmerMod']
## Formula: log(e_conc_ug) ~ age_at_sample + month + hour + (1 | ind_id)
## Data: females_for_menarche_panel_df
##
## REML criterion at convergence: 1758.9
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.16124 -0.62234 0.00391 0.65935 2.72881
##
## Random effects:
## Groups Name Variance Std.Dev.
## ind_id (Intercept) 0.150 0.3874
## Residual 1.461 1.2087
## Number of obs: 537, groups: ind_id, 8
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) -2.78282 0.32894 -8.460
## age_at_sample 0.50977 0.04226 12.062
## month 0.08792 0.01492 5.894
## hour -0.04200 0.01934 -2.171
##
## Correlation of Fixed Effects:
## (Intr) ag_t_s month
## age_at_smpl -0.581
## month -0.257 0.022
## hour -0.616 -0.018 -0.031
#checking for multicollinearity -- low (~ 1)
vif(adjusted_e_model)
## age_at_sample month hour
## 1.000802 1.001467 1.001289
print("No multicollinearity -- yay!")
## [1] "No multicollinearity -- yay!"
Tldr: model passes diagnostics—yay
# checking model diagnostics
plot(adjusted_e_model) #resids vs leverage -- pretty good
lattice::qqmath(adjusted_e_model) ## qq norm plot -- good
plot(adjusted_e_model, # scale location plot -- pretty good
sqrt(abs(resid(.)))~fitted(.),
type=c("p","smooth"), col.line=1)
NOTE: I am holding off on reporting on this until we have the updated behavioral data containing group scan observations of duck face….planning to adapt methods from Sen et al. 2022
NOTE: I am holding off on reporting on this until we have the updated behavioral data containing group scan observations of duck face….planning to adapt methods from Sen et al. 2022
par(mfrow = c(1,1))
# Panel of all individual females who have good E coverage before and after menarche
females_for_cycling_panel<-e_data |>
group_by(ind_id) |>
filter(min(age_at_sample)>3) |>
mutate(count = n()) |>
ungroup() |>
select(ind_id,count) |>
arrange(desc(count)) |>
distinct() |>
filter(count>25) |>
pull(ind_id)
females_for_cycling_panel_df<- e_data |>
filter(ind_id %in% females_for_cycling_panel) |>
mutate(date = as.Date(date)) |>
filter(date > as.Date("2021-12-31")) |>
select(ind_id, age_at_sample, sample_id, date, month, time, hour, e_conc_ug, is_pregnant) |>
arrange(ind_id, age_at_sample)
n_figure_2 <-nrow(females_for_cycling_panel_df)
create_continuous_segments <- function(df) {
df %>%
arrange(date) %>%
mutate(segment = cumsum(is_pregnant == "P") + 1) %>%
group_by(segment) %>%
filter(any(is_pregnant == "NP")) %>%
ungroup() %>%
mutate(segment = cumsum(is_pregnant == "P") + 1) %>%
filter(is_pregnant == "NP")
}
# Loop through each individual and create a ggplot for each
for (individual in females_for_cycling_panel) {
individual_data <- females_for_cycling_panel_df |> filter(ind_id == individual)
# Create continuous segments based on NP periods
segmented_data <- create_continuous_segments(individual_data)
p <- ggplot(data = individual_data, aes(x = date, y = log(e_conc_ug))) +
geom_point(aes(color = is_pregnant)) +
ggtitle(paste0("Estradiol by age at sample for individual ", individual, " (n = ", nrow(individual_data), " samples)")) +
xlab("Date") +
ylab("Log-transformed fecal estradiol (log(ug/g))") +
theme(legend.position = "none") +
geom_line(data = segmented_data, aes(group = segment), color = "blue") # Plot lines for continuous NP segments
print(p)
}
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?
Could you please share the estimated/confirmed ages of all of our IDs? I can’t really assign this easily (unless I work backwards from age at sample info….). I know I had made a Google Sheet at some point with this information, but I no longer have access. Thanks!
Haven’t looked into this yet…stay tuned.
QUESTION: I see some individuals are marekd as
pregnant (e..g, is_pregnant = “P”), but the
preg_trim column is blank. What do I do about this?
For now, I’m excluding pregnant individuals with missing trimester info.
pregnant_df<-e_data |>
select(ind_id, month, hour, age_at_sample, e_conc_ug, preg_trim, pregnancy_num, is_pregnant) |>
filter(is_pregnant == "P") |>
filter(preg_trim !="")
n_pregnant_df <- nrow(pregnant_df)
Note: y-axis is log-transformed
par(mfrow = c(1,1))
ggplot(data = pregnant_df, aes(x = preg_trim, y = log(e_conc_ug)))+
geom_boxplot()+
geom_jitter(alpha = 0.1)+
ggtitle(paste("Estradiol and gestation (N = ", n_pregnant_df, "samples)")) +
xlab("Pregnancy trimester")+
ylab("Log-transformed fecal estradiol (log(ug/g))")
Data is normal following log transformation
shapiro.test(pregnant_df$e_conc_ug)
##
## Shapiro-Wilk normality test
##
## data: pregnant_df$e_conc_ug
## W = 0.33877, p-value < 2.2e-16
shapiro.test(log(pregnant_df$e_conc_ug))
##
## Shapiro-Wilk normality test
##
## data: log(pregnant_df$e_conc_ug)
## W = 0.99539, p-value = 0.6763
Among trimesters, we only see that T2 is significantly lower than T3; no significant difference between T1-T2 or T1-T3.
anova_result_pregancy_e <- aov(log(e_conc_ug) ~ preg_trim, data = pregnant_df)
summary(anova_result_pregancy_e)
## Df Sum Sq Mean Sq F value Pr(>F)
## preg_trim 2 17.7 8.853 5.964 0.00296 **
## Residuals 242 359.2 1.484
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
tukey_results_pregnancy_e <- TukeyHSD(anova_result_pregancy_e)
print(tukey_results_pregnancy_e)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = log(e_conc_ug) ~ preg_trim, data = pregnant_df)
##
## $preg_trim
## diff lwr upr p adj
## T2-T1 -0.3732333 -0.8473855 0.1009188 0.1537996
## T3-T1 0.2486747 -0.2191212 0.7164707 0.4229352
## T3-T2 0.6219081 0.1957494 1.0480667 0.0019638
## what other control variables should I include?
unadjusted_pregnancy_model <- lmer(log(e_conc_ug) ~ age_at_sample + month+ hour+ (1 | ind_id), data = pregnant_df)
adjusted_pregnancy_model <- lmer(log(e_conc_ug) ~ age_at_sample + month + hour+preg_trim + (1 | ind_id), data = pregnant_df)
anova(unadjusted_pregnancy_model, adjusted_pregnancy_model) # adjusted model is better fit--pregnancy predicts E
## refitting model(s) with ML (instead of REML)
## Data: pregnant_df
## Models:
## unadjusted_pregnancy_model: log(e_conc_ug) ~ age_at_sample + month + hour + (1 | ind_id)
## adjusted_pregnancy_model: log(e_conc_ug) ~ age_at_sample + month + hour + preg_trim + (1 | ind_id)
## npar AIC BIC logLik deviance Chisq Df
## unadjusted_pregnancy_model 6 784.51 805.52 -386.25 772.51
## adjusted_pregnancy_model 8 775.48 803.49 -379.74 759.48 13.024 2
## Pr(>Chisq)
## unadjusted_pregnancy_model
## adjusted_pregnancy_model 0.001486 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(adjusted_pregnancy_model)
## Linear mixed model fit by REML ['lmerMod']
## Formula: log(e_conc_ug) ~ age_at_sample + month + hour + preg_trim + (1 |
## ind_id)
## Data: pregnant_df
##
## REML criterion at convergence: 781.5
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.74934 -0.62059 0.01613 0.67608 3.01420
##
## Random effects:
## Groups Name Variance Std.Dev.
## ind_id (Intercept) 0.4587 0.6773
## Residual 1.1646 1.0792
## Number of obs: 245, groups: ind_id, 23
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 1.78746 0.52446 3.408
## age_at_sample 0.01562 0.02276 0.686
## month 0.02944 0.02363 1.246
## hour -0.07165 0.02894 -2.476
## preg_trimT2 -0.17724 0.20993 -0.844
## preg_trimT3 0.41154 0.19692 2.090
##
## Correlation of Fixed Effects:
## (Intr) ag_t_s month hour prg_T2
## age_at_smpl -0.538
## month -0.372 -0.015
## hour -0.655 0.008 0.030
## preg_trimT2 -0.293 -0.080 0.452 -0.019
## preg_trimT3 -0.311 -0.017 0.403 -0.019 0.660
Tldr: model passes diagnostics—yay
#checking for multicollinearity -- low (~ 1)
vif(adjusted_pregnancy_model)
## GVIF Df GVIF^(1/(2*Df))
## age_at_sample 1.009100 1 1.004540
## month 1.291221 1 1.136319
## hour 1.002459 1 1.001229
## preg_trim 1.301658 2 1.068130
# checking model diagnostics
plot(adjusted_pregnancy_model) #resids vs leverage -- good
lattice::qqmath(adjusted_pregnancy_model) ## qq norm plot -- good...small tail on right
plot(adjusted_pregnancy_model, # scale location plot -- good
sqrt(abs(resid(.)))~fitted(.),
type=c("p","smooth"), col.line=1)
Only looking at adults (ages >6)
lactation_df <- e_data |>
select(ind_id, month, hour, age_at_sample, e_conc_ug, lactating) |>
filter(age_at_sample > 6)
n_lactation_df<-nrow(lactation_df)
Note: y-axis is log-transformed
ggplot(data = lactation_df, aes(x = lactating, y = log(e_conc_ug)))+
geom_boxplot()+
geom_jitter(alpha = 0.1)+
ggtitle(paste("Estradiol and lactation (N = ", n_pregnant_df, "samples)")) +
xlab("Lactating")+
ylab("Log-transformed fecal estradiol (log(ug/g))")
Data is still not normal following log transformation
shapiro.test(lactation_df$e_conc_ug)
##
## Shapiro-Wilk normality test
##
## data: lactation_df$e_conc_ug
## W = 0.26856, p-value < 2.2e-16
shapiro.test(log(lactation_df$e_conc_ug))
##
## Shapiro-Wilk normality test
##
## data: log(lactation_df$e_conc_ug)
## W = 0.99144, p-value = 6.802e-05
Data is not normal, so can’t do ANOVA. Let’s use a nonparametric approach to see if E is significantly lower during lactation….it is (W = 98014, p < 0.001)
wilcox.test(log(e_conc_ug) ~ lactating, data = lactation_df)
##
## Wilcoxon rank sum test with continuity correction
##
## data: log(e_conc_ug) by lactating
## W = 98014, p-value = 5.847e-09
## alternative hypothesis: true location shift is not equal to 0
Technically, data should be normal for this. BUT I think our sample might be large enough that we’re okay violating this assumption?
## what other control variables should I include?
unadjusted_lactation_model <- lmer(log(e_conc_ug) ~ age_at_sample + month+ hour + (1 | ind_id), data = e_data)
adjusted_lactation_model <- lmer(log(e_conc_ug) ~ age_at_sample + month+ hour + lactating + (1 | ind_id), data = e_data)
anova(unadjusted_lactation_model,adjusted_lactation_model) # adjusted model is better fit--lactation predicts E
## refitting model(s) with ML (instead of REML)
## Data: e_data
## Models:
## unadjusted_lactation_model: log(e_conc_ug) ~ age_at_sample + month + hour + (1 | ind_id)
## adjusted_lactation_model: log(e_conc_ug) ~ age_at_sample + month + hour + lactating + (1 | ind_id)
## npar AIC BIC logLik deviance Chisq Df
## unadjusted_lactation_model 6 4843.2 4874.7 -2415.6 4831.2
## adjusted_lactation_model 7 4803.3 4840.0 -2394.6 4789.3 41.974 1
## Pr(>Chisq)
## unadjusted_lactation_model
## adjusted_lactation_model 9.25e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(adjusted_lactation_model) # lactation = lower E
## Linear mixed model fit by REML ['lmerMod']
## Formula: log(e_conc_ug) ~ age_at_sample + month + hour + lactating + (1 |
## ind_id)
## Data: e_data
##
## REML criterion at convergence: 4813.5
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.1108 -0.6274 -0.0233 0.6412 3.3238
##
## Random effects:
## Groups Name Variance Std.Dev.
## ind_id (Intercept) 2.147 1.465
## Residual 1.626 1.275
## Number of obs: 1403, groups: ind_id, 44
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) -1.633063 0.349677 -4.670
## age_at_sample 0.173253 0.020625 8.400
## month 0.061315 0.009774 6.273
## hour -0.022011 0.012923 -1.703
## lactatingYES -0.635455 0.094448 -6.728
##
## Correlation of Fixed Effects:
## (Intr) ag_t_s month hour
## age_at_smpl -0.587
## month -0.188 0.038
## hour -0.426 0.007 0.032
## lactatngYES 0.058 -0.182 -0.047 0.040
Tldr: model passes diagnostics—yay
#checking for multicollinearity -- low (~ 1)
vif(adjusted_lactation_model)
## age_at_sample month hour lactating
## 1.035517 1.004192 1.002907 1.038010
# checking model diagnostics
plot(adjusted_lactation_model) #resids vs leverage -- good
lattice::qqmath(adjusted_lactation_model) ## qq norm plot -- good...small tail on right
plot(adjusted_lactation_model, # scale location plot -- good
sqrt(abs(resid(.)))~fitted(.),
type=c("p","smooth"), col.line=1)
Can I please have access to these data?